import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from openpyxl import load_workbook
import start

# %%

# Input workbooks: gold-standard annotations and the classifications to score.
ANNOTATIONS_FILE = start.MAIN_DIR + "data/clean/character_classifications_gold.xlsx"
CLASSIFICATION_FILE = (
    start.MAIN_DIR + "data/clean/gpt_classifications_characters_dev.xlsx"
)

# Output workbook for the performance numbers.
RESULTS_FILE = start.MAIN_DIR + "results/performance_characters_dev.xlsx"

# Load the annotations and keep only the identifier, the gold label and the
# split ("set") column.
gold = pd.read_excel(ANNOTATIONS_FILE).loc[
    :, ["unique_id", "character_gold", "set"]
]

# %%
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np


def bootstrap_f1(y_true, y_pred, n_bootstraps=1000, random_state=12):
    """Bootstrap the F1 score and return its mean and standard error.

    Observations are resampled with replacement ``n_bootstraps`` times; every
    resample has the same number of observations as the input. ``f1_score`` is
    evaluated on each resample with its default arguments.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels, positionally aligned with
            ``y_true``.
        n_bootstraps: Number of bootstrap resamples to draw.
        random_state: Seed for the NumPy random generator, so repeated calls
            produce the same resamples.

    Returns:
        A ``(mean, std)`` tuple over the bootstrap F1 scores, where ``std`` is
        the sample standard deviation (``ddof=1``).
    """
    rng = np.random.default_rng(seed=random_state)

    # Convert once up front: the inputs do not change between resamples, so
    # rebuilding the arrays inside the loop would be wasted work.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    n = len(true_arr)

    scores = []
    for _ in range(n_bootstraps):
        # The same index vector is applied to both arrays so that each
        # (truth, prediction) pair stays together in the resample.
        indices = rng.integers(0, n, n)
        scores.append(f1_score(true_arr[indices], pred_arr[indices]))

    scores = np.array(scores)
    return scores.mean(), scores.std(ddof=1)


def bootstrap_f1_micro(y_true, y_pred, n_bootstraps=1000, random_state=12):
    """Bootstrap the micro-averaged F1 score; return its mean and standard error.

    Observations (rows, for 2-D indicator input) are resampled with
    replacement ``n_bootstraps`` times; every resample has the same number of
    observations as the input. ``f1_score(..., average="micro")`` is evaluated
    on each resample.

    Args:
        y_true: Array-like of ground-truth labels or label-indicator rows.
        y_pred: Array-like of predictions, positionally aligned with
            ``y_true``.
        n_bootstraps: Number of bootstrap resamples to draw.
        random_state: Seed for the NumPy random generator, so repeated calls
            produce the same resamples.

    Returns:
        A ``(mean, std)`` tuple over the bootstrap F1 scores, where ``std`` is
        the sample standard deviation (``ddof=1``).
    """
    rng = np.random.default_rng(seed=random_state)

    # Convert once up front: the inputs do not change between resamples, so
    # rebuilding the arrays inside the loop would be wasted work.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    n = len(true_arr)

    scores = []
    for _ in range(n_bootstraps):
        # Index along the first axis only, keeping each observation's truth
        # and prediction together in the resample.
        indices = rng.integers(0, n, n)
        scores.append(
            f1_score(true_arr[indices], pred_arr[indices], average="micro")
        )

    scores = np.array(scores)
    return scores.mean(), scores.std(ddof=1)


# %%

# Open the classification workbook once: collect its sheet names and parse
# every prompt sheet from the same handle instead of re-reading the file from
# disk for each sheet. Sheets whose name starts with "Sheet" are skipped.
with pd.ExcelFile(CLASSIFICATION_FILE) as classification_book:
    sheet_names = classification_book.sheet_names
    gpt_by_prompt = {
        name: classification_book.parse(name)[["unique_id", "response"]]
        for name in sheet_names
        if not name.startswith("Sheet")
    }

wb = load_workbook(RESULTS_FILE)
ws = wb["results"]

# Each prompt occupies two consecutive rows starting at `row`: the F1 scores,
# then their bootstrap standard errors in parentheses. Column 1 holds the
# prompt name; results start at `start_result_col`.
row = 2
start_result_col = 2

for PROMPT_NAME, gpt in gpt_by_prompt.items():
    df = gold.merge(gpt, on="unique_id", how="inner")
    # Copy the filtered frame so the column assignments below write to an
    # independent DataFrame rather than to a slice of the merge result.
    df = df[df["set"] == "dev"].copy()
    df["gold_standard"] = df["character_gold"]
    df["gpt_classification"] = df["response"]

    # One-vs-rest 0/1 indicator columns for every character label.
    for character in ["hero", "villain", "victim", "other"]:
        df["gold_" + character] = np.where(df.gold_standard == character, 1, 0)
        df["gpt_" + character] = np.where(df.gpt_classification == character, 1, 0)

    gold_indicators = df[["gold_hero", "gold_villain", "gold_victim"]]
    gpt_indicators = df[["gpt_hero", "gpt_villain", "gpt_victim"]]

    # Per-class F1 score for hero, villain and victim.
    f1_hero = f1_score(df.gold_hero, df.gpt_hero)
    f1_villain = f1_score(df.gold_villain, df.gpt_villain)
    f1_victim = f1_score(df.gold_victim, df.gpt_victim)

    # Micro-averaged F1 score pooled over hero, villain and victim.
    f1_micro = f1_score(gold_indicators, gpt_indicators, average="micro")

    # Bootstrap standard errors; the bootstrap means are not reported.
    _, f1_hero_se = bootstrap_f1(df.gold_hero, df.gpt_hero)
    _, f1_villain_se = bootstrap_f1(df.gold_villain, df.gpt_villain)
    _, f1_victim_se = bootstrap_f1(df.gold_victim, df.gpt_victim)
    _, f1_micro_se = bootstrap_f1_micro(gold_indicators, gpt_indicators)

    ws.cell(row=row, column=1, value=PROMPT_NAME)
    # np.round accepts both NumPy scalars and built-in floats, whereas the
    # `.round` method exists only on NumPy scalars; NOTE(review): f1_score may
    # return a built-in float depending on the scikit-learn version.
    for offset, metric in enumerate([f1_hero, f1_villain, f1_victim, f1_micro]):
        ws.cell(
            row=row,
            column=start_result_col + offset,
            value=float(np.round(metric, 2)),
        )
    row += 1

    for offset, metric in enumerate(
        [f1_hero_se, f1_villain_se, f1_victim_se, f1_micro_se]
    ):
        ws.cell(
            row=row,
            column=start_result_col + offset,
            value=f"({float(np.round(metric, 2))})",
        )
    row += 1

wb.save(RESULTS_FILE)
